import pandas as pd
import difflib

# Load the node-relation workbook (path is relative to the working directory).
df = pd.read_excel('../../etc/nodes_relation.xlsx')

# Keep only the three columns used by this script (judging by their headers:
# table name, field name, and the field name as stored in ES), then discard
# every row that is missing a value in any of them.
df = df.loc[:, ['表中文名', '字段中文名', '字段中文名入库ES']].dropna(axis=0, how='any')


def string_sim(s1, s2):
    """Return a fast, approximate similarity score for two sequences.

    The score is ``difflib.SequenceMatcher.quick_ratio()``: a float in
    ``[0.0, 1.0]`` that is an upper bound on the exact ``ratio()``.  It only
    counts how many elements the two sequences have in common and ignores
    their order, so e.g. ``"ab"`` and ``"ba"`` score ``1.0``.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        The quick similarity ratio; ``1.0`` when both sequences are empty.
    """
    matcher = difflib.SequenceMatcher(a=s1, b=s2)
    return matcher.quick_ratio()


# Similarity score that a pair of field names must strictly exceed to be
# reported as synonym candidates.
SIM_THRESHOLD = 0.5

# Extract both columns once as positional arrays.  The table-name column is
# loop-invariant, so it is looked up here rather than on every match inside
# the quadratic pairwise comparison below.
strings = df['字段中文名'].values
table_names = df['表中文名'].values

# For each row, collect a "<table name>:<field name>" label for every OTHER
# row whose field name scores above the threshold.  Rows are excluded by
# position (i != j), not by value, so an identical field name that appears
# in a different row is still reported.
synonyms = []
for i, s1 in enumerate(strings):
    synonyms.append([
        table_names[j] + ':' + s2
        for j, s2 in enumerate(strings)
        if i != j and string_sim(s1, s2) > SIM_THRESHOLD
    ])

# Attach the candidate lists as a new column (one Python list per row) and
# export.  The 'utf_8_sig' codec prefixes the file with a UTF-8 BOM --
# presumably so spreadsheet tools detect the encoding of the Chinese text.
df['同义属性'] = synonyms
df.to_csv('synonym.csv', index=False, encoding='utf_8_sig')
print('done')
